library(tidyverse)
library(ggplot2)
library(lavaan)
library(car)
library(caret)
library(ggRandomForests)
library(VSURF)
library(glmnet)
library(Boruta)
library(doParallel)VSURF
Data set
This data set is from the 2015 Asian American Quality of Life survey. Participants are from Austin, Texas.
Input data set
qol <- read_csv("AAQoL.csv") |> mutate(across(where(is.character), ~as.factor(.x))) |>
mutate(`English Difficulties`=relevel(`English Difficulties`,ref="Not at all"),
`English Speaking`=relevel(`English Speaking`,ref="Not at all"),
Ethnicity = relevel(Ethnicity,ref="Chinese"),
Religion=relevel(Religion,ref="None")) |>
mutate(Income_median = case_match(Income,"$0 - $9,999"~"Below",
"$10,000 - $19,999" ~"Below",
"$20,000 - $29,999"~"Below",
"$30,000 - $39,999"~"Below",
"$40,000 - $49,999"~"Below",
"$50,000 - $59,999"~"Below",
"$60,000 - $69,999"~"Above",
"$70,000 and over"~"Above",
.default=Income)) |>
mutate(Income_median = factor(Income_median, levels=c("Below","Above"))) |>
mutate(across(`Familiarity with America`:`Familiarity with Ethnic Origin`,~factor(.x,levels=c("Very low","Low", "High", "Very high"))),
across(`Identify Ethnically`,~factor(.x,levels=c("Not at all","Not very close","Somewhat close","Very close"))),
across(`Belonging`,~factor(.x,levels=c("Not at all","Not very much","Somewhat","Very much"))),
`Primary Language` = as.factor(`Primary Language`))New names:
Rows: 2609 Columns: 231
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(190): Gender, Ethnicity, Marital Status, No One, Spouse, Children, Gran... dbl
(41): Survey ID, Age, Education Completed, Household Size, Grandparent,...
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `Other` -> `Other...17`
• `Other` -> `Other...89`
qol |> DT::datatable()Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html
Family
rfdata <- qol |> filter(Family %in% c("No","Yes")) |>
mutate(Family=droplevels(Family)) |>
select(Family, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) |>
na.omit() |>
as.data.frame() |>
rename_with(make.names)
imbal <- ROSE::ROSE(Family~.,
data=rfdata,
seed=3)$data
# VSURF(Family~.,imbal,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
VSURF(Family~.,rfdata,na.action="na.omit",parallel=T,verbose=F)->vsurf.modWarning in VSURF.formula(Family ~ ., rfdata, na.action = "na.omit", parallel = T, : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
vsurf.mod |> summary()
VSURF computation time: 11.6 secs
VSURF selected:
18 variables at thresholding step (in 6.2 secs)
3 variables at interpretation step (in 3.6 secs)
3 variables at prediction step (in 1.7 secs)
VSURF ran in parallel on a PSOCK cluster and used 15 cores
names(rfdata[,-1])[vsurf.mod$varselect.pred][1] "Age" "Ethnicity" "English.Difficulties"
names(rfdata[,-1])[vsurf.mod$varselect.interp][1] "Age" "Ethnicity" "English.Difficulties"
plot(vsurf.mod)
vsurf.mod$mean.perf[1] 0.4059747
Importance
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
Importance = vsurf.mod$imp.mean.dec,
sd_Importance = vsurf.mod$imp.sd.dec
)|>
mutate(fill = case_when(Variable=="Ethnicity"~"red",
.default="black"))
vi |> mutate(across(Importance:sd_Importance,~round(.x,5))) Variable Importance sd_Importance fill
1 Age 0.02655 0.00070 black
2 Ethnicity 0.01326 0.00093 red
3 English.Difficulties 0.00791 0.00061 black
4 Duration.of.Residency 0.00531 0.00071 black
5 US.Born 0.00429 0.00041 black
6 English.Speaking 0.00305 0.00054 black
7 Full.Time.Employment 0.00286 0.00045 black
8 Gender 0.00269 0.00052 black
9 Primary.Language 0.00262 0.00039 black
10 Belonging 0.00240 0.00071 black
11 Identify.Ethnically 0.00239 0.00052 black
12 Discrimination 0.00230 0.00046 black
13 Dental.Insurance 0.00208 0.00031 black
14 Religion 0.00189 0.00069 black
15 Income_median 0.00188 0.00032 black
16 Health.Insurance 0.00129 0.00018 black
17 Familiarity.with.Ethnic.Origin 0.00092 0.00046 black
18 Familiarity.with.America 0.00068 0.00048 black
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
geom_bar(stat = "identity",alpha=0.4) +
geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)
ggsave(filename = "VSURF_importance_family.png", width=8, height=12,units="in")Logistic regression (Interpretation)
lr <- rfdata |> select(Family,names(rfdata[,-1])[vsurf.mod$varselect.pred])
lr_mod <- glm(Family~.,family=binomial,data=lr)
summary(lr_mod)
Call:
glm(formula = Family ~ ., family = binomial, data = lr)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.039877 0.164542 6.320 2.62e-10 ***
Age -0.013484 0.002900 -4.650 3.32e-06 ***
EthnicityAsian Indian -0.445283 0.138338 -3.219 0.001287 **
EthnicityFilipino -0.279527 0.175999 -1.588 0.112234
EthnicityKorean -0.648903 0.142419 -4.556 5.21e-06 ***
EthnicityOther -0.851280 0.220476 -3.861 0.000113 ***
EthnicityVietnamese -0.729705 0.142115 -5.135 2.83e-07 ***
English.DifficultiesMuch 0.458596 0.134518 3.409 0.000652 ***
English.DifficultiesNot much -0.001948 0.122826 -0.016 0.987345
English.DifficultiesVery much -0.270289 0.133988 -2.017 0.043668 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2735.1 on 1974 degrees of freedom
Residual deviance: 2655.2 on 1965 degrees of freedom
AIC: 2675.2
Number of Fisher Scoring iterations: 4
car::Anova(lr_mod)Analysis of Deviance Table (Type II tests)
Response: Family
LR Chisq Df Pr(>Chisq)
Age 21.896 1 2.878e-06 ***
Ethnicity 38.682 5 2.752e-07 ***
English.Difficulties 24.968 3 1.568e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
broom::tidy(lr_mod,exponentiate=T,conf.int=T) |> DT::datatable()CV LASSO
x <- makeX(rfdata[,-1])
y <- rfdata$Family
cv.glmnet(x,y,family="binomial", type.measure = "auc")-> modcv
modcv$lambda.1se[1] 0.01749015
coef(modcv,s="lambda.1se")55 x 1 sparse Matrix of class "dgCMatrix"
s1
(Intercept) 5.081756e-01
EthnicityChinese 3.446551e-01
EthnicityAsian Indian .
EthnicityFilipino 2.000085e-02
EthnicityKorean .
EthnicityOther -4.435060e-02
EthnicityVietnamese -5.947637e-02
Age -6.976510e-03
GenderFemale -8.492122e-02
GenderMale 6.220380e-15
ReligionNone .
ReligionBuddhist .
ReligionCatholic .
ReligionHindu .
ReligionMuslim .
ReligionOther .
ReligionProtestant .
Full.Time.Employment0 2.059562e-01
Full.Time.EmploymentEmployed full time -3.548327e-04
Income_medianBelow .
Income_medianAbove .
US.BornNo -4.251606e-01
US.BornYes .
Duration.of.Residency -1.110031e-03
Primary.Language0 .
Primary.Language1 .
English.SpeakingNot at all .
English.SpeakingNot well .
English.SpeakingVery well .
English.SpeakingWell .
English.DifficultiesNot at all .
English.DifficultiesMuch 3.079440e-01
English.DifficultiesNot much .
English.DifficultiesVery much -9.182944e-03
Familiarity.with.AmericaVery low .
Familiarity.with.AmericaLow .
Familiarity.with.AmericaHigh .
Familiarity.with.AmericaVery high .
Familiarity.with.Ethnic.OriginVery low .
Familiarity.with.Ethnic.OriginLow .
Familiarity.with.Ethnic.OriginHigh .
Familiarity.with.Ethnic.OriginVery high .
Identify.EthnicallyNot at all -4.858230e-01
Identify.EthnicallyNot very close .
Identify.EthnicallySomewhat close .
Identify.EthnicallyVery close .
BelongingNot at all -1.371431e-04
BelongingNot very much .
BelongingSomewhat .
BelongingVery much 1.028260e-02
Discrimination 1.220443e-01
Health.Insurance0 .
Health.InsuranceYes .
Dental.Insurance0 7.624372e-02
Dental.InsuranceYes -4.376660e-14
print(modcv)
Call: cv.glmnet(x = x, y = y, type.measure = "auc", family = "binomial")
Measure: AUC
Lambda Index Measure SE Nonzero
min 0.005727 26 0.6277 0.009836 33
1se 0.017490 14 0.6184 0.010205 19
Health Professionals
rfdata <- qol |> select(`Heal Professionals`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) |>
na.omit() |>
as.data.frame() |>
rename_with(make.names)
imbal <- ROSE::ROSE(Heal.Professionals~.,
data=rfdata,
seed=3)$data
# VSURF(Heal.Professionals~.,imbal,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
VSURF(Heal.Professionals~.,rfdata,na.action="na.omit",parallel=T,verbose=F)->vsurf.modWarning in VSURF.formula(Heal.Professionals ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
vsurf.mod |> summary()
VSURF computation time: 13.3 secs
VSURF selected:
15 variables at thresholding step (in 6.4 secs)
6 variables at interpretation step (in 2.8 secs)
3 variables at prediction step (in 4.1 secs)
VSURF ran in parallel on a PSOCK cluster and used 15 cores
names(rfdata[,-1])[vsurf.mod$varselect.pred][1] "Duration.of.Residency" "English.Speaking" "English.Difficulties"
names(rfdata[,-1])[vsurf.mod$varselect.interp][1] "Duration.of.Residency" "English.Speaking" "English.Difficulties"
[4] "Dental.Insurance" "Health.Insurance" "Ethnicity"
plot(vsurf.mod)
vsurf.mod$mean.perf[1] 0.3803138
Importance
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
Importance = vsurf.mod$imp.mean.dec,
sd_Importance = vsurf.mod$imp.sd.dec
)|>
mutate(fill = case_when(Variable=="Ethnicity"~"red",
.default="black"))
vi |> mutate(across(Importance:sd_Importance,~round(.x,5))) Variable Importance sd_Importance fill
1 Duration.of.Residency 0.01576 0.00120 black
2 English.Speaking 0.01511 0.00108 black
3 English.Difficulties 0.00605 0.00079 black
4 Dental.Insurance 0.00565 0.00035 black
5 Health.Insurance 0.00533 0.00036 black
6 Ethnicity 0.00510 0.00069 red
7 Religion 0.00481 0.00070 black
8 Age 0.00441 0.00059 black
9 Income_median 0.00311 0.00042 black
10 Familiarity.with.America 0.00287 0.00053 black
11 Discrimination 0.00245 0.00038 black
12 Familiarity.with.Ethnic.Origin 0.00231 0.00069 black
13 Identify.Ethnically 0.00211 0.00052 black
14 Primary.Language 0.00110 0.00043 black
15 Belonging 0.00078 0.00050 black
16 Full.Time.Employment 0.00031 0.00039 black
17 US.Born -0.00023 0.00019 black
18 Gender -0.00030 0.00038 black
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
geom_bar(stat = "identity",alpha=0.4) +
geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)
ggsave(filename = "VSURF_importance_hp.png", width=12, height=8,units="in")Logistic regression (Interpretation)
lr <- rfdata |> select(Heal.Professionals,names(rfdata[,-1])[vsurf.mod$varselect.pred])
lr_mod <- glm(Heal.Professionals~.,family=binomial,data=lr)
summary(lr_mod)
Call:
glm(formula = Heal.Professionals ~ ., family = binomial, data = lr)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.884624 0.251110 -3.523 0.000427 ***
Duration.of.Residency 0.028505 0.004013 7.102 1.23e-12 ***
English.SpeakingNot well 0.227614 0.251086 0.907 0.364662
English.SpeakingVery well 1.064288 0.242697 4.385 1.16e-05 ***
English.SpeakingWell 0.762795 0.241370 3.160 0.001576 **
English.DifficultiesMuch -0.361097 0.159166 -2.269 0.023288 *
English.DifficultiesNot much -0.267057 0.147068 -1.816 0.069389 .
English.DifficultiesVery much -0.275771 0.139962 -1.970 0.048801 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2736.5 on 1975 degrees of freedom
Residual deviance: 2557.1 on 1968 degrees of freedom
AIC: 2573.1
Number of Fisher Scoring iterations: 4
car::Anova(lr_mod)Analysis of Deviance Table (Type II tests)
Response: Heal.Professionals
LR Chisq Df Pr(>Chisq)
Duration.of.Residency 52.489 1 4.327e-13 ***
English.Speaking 42.704 3 2.844e-09 ***
English.Difficulties 6.501 3 0.08962 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
broom::tidy(lr_mod,exponentiate=T,conf.int=T)|> DT::datatable()CV LASSO
x <- makeX(rfdata[,-1])
y <- rfdata$Heal.Professionals
cv.glmnet(x,y,family="binomial", type.measure = "auc")-> modcv
modcv$lambda.1se[1] 0.0289948
coef(modcv,s="lambda.1se")55 x 1 sparse Matrix of class "dgCMatrix"
s1
(Intercept) -1.868095e-01
EthnicityChinese .
EthnicityAsian Indian .
EthnicityFilipino 1.763755e-01
EthnicityKorean -7.409358e-02
EthnicityOther .
EthnicityVietnamese .
Age .
GenderFemale .
GenderMale .
ReligionNone .
ReligionBuddhist .
ReligionCatholic .
ReligionHindu .
ReligionMuslim .
ReligionOther .
ReligionProtestant .
Full.Time.Employment0 .
Full.Time.EmploymentEmployed full time .
Income_medianBelow .
Income_medianAbove .
US.BornNo .
US.BornYes .
Duration.of.Residency 1.906691e-02
Primary.Language0 .
Primary.Language1 .
English.SpeakingNot at all .
English.SpeakingNot well -2.112454e-01
English.SpeakingVery well 1.940132e-01
English.SpeakingWell .
English.DifficultiesNot at all 1.617571e-01
English.DifficultiesMuch .
English.DifficultiesNot much .
English.DifficultiesVery much .
Familiarity.with.AmericaVery low .
Familiarity.with.AmericaLow .
Familiarity.with.AmericaHigh .
Familiarity.with.AmericaVery high 8.675712e-02
Familiarity.with.Ethnic.OriginVery low -8.681946e-02
Familiarity.with.Ethnic.OriginLow .
Familiarity.with.Ethnic.OriginHigh .
Familiarity.with.Ethnic.OriginVery high .
Identify.EthnicallyNot at all .
Identify.EthnicallyNot very close .
Identify.EthnicallySomewhat close .
Identify.EthnicallyVery close .
BelongingNot at all .
BelongingNot very much .
BelongingSomewhat .
BelongingVery much .
Discrimination .
Health.Insurance0 -3.599801e-01
Health.InsuranceYes 1.102124e-13
Dental.Insurance0 -2.204331e-01
Dental.InsuranceYes 6.410888e-14
print(modcv)
Call: cv.glmnet(x = x, y = y, type.measure = "auc", family = "binomial")
Measure: AUC
Lambda Index Measure SE Nonzero
min 0.01042 26 0.6903 0.006749 23
1se 0.02899 15 0.6846 0.007234 12
Physical Check-up
#install.packages("randomForestSRC)
rfdata <- qol |>
select(`Physical Check-up`,Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)
imbal <- ROSE::ROSE(Physical.Check.up~.,
data=rfdata,
seed=3)$data
# VSURF(Physical.Check.up~.,imbal,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
VSURF(Physical.Check.up~.,rfdata,na.action="na.omit",parallel=T,verbose=F)->vsurf.modWarning in VSURF.formula(Physical.Check.up ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
vsurf.mod |> summary()
VSURF computation time: 11.4 secs
VSURF selected:
17 variables at thresholding step (in 5.7 secs)
5 variables at interpretation step (in 3.4 secs)
2 variables at prediction step (in 2.4 secs)
VSURF ran in parallel on a PSOCK cluster and used 15 cores
names(rfdata[,-1])[vsurf.mod$varselect.pred][1] "Duration.of.Residency" "Health.Insurance"
names(rfdata[,-1])[vsurf.mod$varselect.interp][1] "Duration.of.Residency" "Age" "Health.Insurance"
[4] "Dental.Insurance" "Income_median"
plot(vsurf.mod)
vsurf.mod$mean.perf[1] 0.2829858
Importance
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
Importance = vsurf.mod$imp.mean.dec,
sd_Importance = vsurf.mod$imp.sd.dec
)|>
mutate(fill = case_when(Variable=="Ethnicity"~"red",
.default="black"))
vi |> mutate(across(Importance:sd_Importance,~round(.x,5))) Variable Importance sd_Importance fill
1 Duration.of.Residency 0.02969 0.00074 black
2 Age 0.02669 0.00075 black
3 Health.Insurance 0.02241 0.00063 black
4 Dental.Insurance 0.01358 0.00074 black
5 Income_median 0.00725 0.00045 black
6 Ethnicity 0.00676 0.00073 red
7 EnglishSpeak 0.00532 0.00052 black
8 Employment 0.00473 0.00036 black
9 EnglishDiff 0.00439 0.00050 black
10 Religion 0.00421 0.00052 black
11 Familiarity.with.America 0.00244 0.00049 black
12 Gender 0.00233 0.00044 black
13 Familiarity.with.Ethnic.Origin 0.00231 0.00038 black
14 Primary.Language 0.00107 0.00030 black
15 US.Born 0.00096 0.00014 black
16 Discrimination 0.00061 0.00027 black
17 Identify.Ethnically 0.00053 0.00037 black
18 Belonging 0.00016 0.00038 black
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
geom_bar(stat = "identity",alpha=0.4) +
geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)
ggsave(filename = "VSURF_importance_PC.png", width=12, height=8,units="in")Logistic regression (Interpretation)
lr <- rfdata |> select(Physical.Check.up,names(rfdata[,-1])[vsurf.mod$varselect.interp])
lr_mod <- glm(Physical.Check.up~.,family=binomial,data=lr)
summary(lr_mod)
Call:
glm(formula = Physical.Check.up ~ ., family = binomial, data = lr)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.928578 0.205890 -9.367 < 2e-16 ***
Duration.of.Residency 0.033459 0.005215 6.415 1.40e-10 ***
Age 0.017649 0.003775 4.675 2.94e-06 ***
Health.InsuranceYes 1.169682 0.159911 7.315 2.58e-13 ***
Dental.InsuranceYes 0.647069 0.123481 5.240 1.60e-07 ***
Income_medianAbove 0.270055 0.114061 2.368 0.0179 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2466.2 on 1965 degrees of freedom
Residual deviance: 2152.4 on 1960 degrees of freedom
AIC: 2164.4
Number of Fisher Scoring iterations: 4
car::Anova(lr_mod)Analysis of Deviance Table (Type II tests)
Response: Physical.Check.up
LR Chisq Df Pr(>Chisq)
Duration.of.Residency 43.282 1 4.738e-11 ***
Age 22.227 1 2.423e-06 ***
Health.Insurance 55.729 1 8.318e-14 ***
Dental.Insurance 27.267 1 1.772e-07 ***
Income_median 5.582 1 0.01815 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
broom::tidy(lr_mod,exponentiate=T,conf.int=T)|>
mutate(across(estimate:conf.high,~round(.x,3))) |>
DT::datatable()CV LASSO
x <- makeX(rfdata[,-1])
y <- rfdata$Physical.Check.up
cv.glmnet(x,y,family="binomial", type.measure = "auc")-> modcv
modcv$lambda.1se[1] 0.03886821
coef(modcv,s="lambda.1se")55 x 1 sparse Matrix of class "dgCMatrix"
s1
(Intercept) 4.458627e-01
EthnicityChinese .
EthnicityAsian Indian .
EthnicityFilipino .
EthnicityKorean -1.261200e-01
EthnicityOther .
EthnicityVietnamese .
Age 5.442383e-03
GenderFemale 1.135173e-01
GenderMale .
ReligionNone .
ReligionBuddhist .
ReligionCatholic .
ReligionHindu .
ReligionMuslim .
ReligionOther .
ReligionProtestant .
Employment0 .
EmploymentEmployed full time .
Income_medianBelow -8.947533e-03
Income_medianAbove 4.649018e-15
US.BornNo .
US.BornYes .
Duration.of.Residency 2.328732e-02
Primary.Language0 .
Primary.Language1 .
EnglishSpeakNot at all .
EnglishSpeakNot well .
EnglishSpeakVery well .
EnglishSpeakWell .
EnglishDiffNot at all .
EnglishDiffMuch .
EnglishDiffNot much .
EnglishDiffVery much .
Familiarity.with.AmericaVery low .
Familiarity.with.AmericaLow .
Familiarity.with.AmericaHigh .
Familiarity.with.AmericaVery high .
Familiarity.with.Ethnic.OriginVery low .
Familiarity.with.Ethnic.OriginLow .
Familiarity.with.Ethnic.OriginHigh .
Familiarity.with.Ethnic.OriginVery high .
Identify.EthnicallyNot at all .
Identify.EthnicallyNot very close .
Identify.EthnicallySomewhat close .
Identify.EthnicallyVery close .
BelongingNot at all .
BelongingNot very much .
BelongingSomewhat .
BelongingVery much .
Discrimination .
Health.Insurance0 -8.297677e-01
Health.InsuranceYes .
Dental.Insurance0 -4.033314e-01
Dental.InsuranceYes .
print(modcv)
Call: cv.glmnet(x = x, y = y, type.measure = "auc", family = "binomial")
Measure: AUC
Lambda Index Measure SE Nonzero
min 0.00877 30 0.7559 0.01248 28
1se 0.03887 14 0.7437 0.01008 8
Boruta
set.seed(123)
Boruta(Physical.Check.up~.,
data=rfdata,
doTrace=2,
) ->pc_boruta 1. run of importance source...
2. run of importance source...
3. run of importance source...
4. run of importance source...
5. run of importance source...
6. run of importance source...
7. run of importance source...
8. run of importance source...
9. run of importance source...
10. run of importance source...
11. run of importance source...
After 11 iterations, +2.7 secs:
confirmed 10 attributes: Age, Dental.Insurance, Duration.of.Residency, Employment, EnglishDiff and 5 more;
still have 8 attributes left.
12. run of importance source...
13. run of importance source...
14. run of importance source...
15. run of importance source...
After 15 iterations, +3.6 secs:
rejected 2 attributes: Discrimination, Identify.Ethnically;
still have 6 attributes left.
16. run of importance source...
17. run of importance source...
18. run of importance source...
19. run of importance source...
After 19 iterations, +4.4 secs:
confirmed 1 attribute: Familiarity.with.America;
still have 5 attributes left.
20. run of importance source...
21. run of importance source...
22. run of importance source...
23. run of importance source...
24. run of importance source...
25. run of importance source...
26. run of importance source...
27. run of importance source...
28. run of importance source...
29. run of importance source...
30. run of importance source...
31. run of importance source...
32. run of importance source...
33. run of importance source...
34. run of importance source...
After 34 iterations, +7.4 secs:
rejected 1 attribute: Belonging;
still have 4 attributes left.
35. run of importance source...
36. run of importance source...
37. run of importance source...
38. run of importance source...
39. run of importance source...
40. run of importance source...
41. run of importance source...
42. run of importance source...
43. run of importance source...
44. run of importance source...
45. run of importance source...
46. run of importance source...
47. run of importance source...
After 47 iterations, +9.9 secs:
confirmed 1 attribute: Primary.Language;
still have 3 attributes left.
48. run of importance source...
49. run of importance source...
50. run of importance source...
51. run of importance source...
52. run of importance source...
53. run of importance source...
54. run of importance source...
55. run of importance source...
56. run of importance source...
57. run of importance source...
58. run of importance source...
59. run of importance source...
60. run of importance source...
61. run of importance source...
62. run of importance source...
63. run of importance source...
64. run of importance source...
65. run of importance source...
66. run of importance source...
67. run of importance source...
68. run of importance source...
69. run of importance source...
70. run of importance source...
71. run of importance source...
72. run of importance source...
73. run of importance source...
After 73 iterations, +15 secs:
confirmed 1 attribute: Religion;
still have 2 attributes left.
74. run of importance source...
75. run of importance source...
76. run of importance source...
77. run of importance source...
78. run of importance source...
79. run of importance source...
80. run of importance source...
81. run of importance source...
82. run of importance source...
83. run of importance source...
84. run of importance source...
85. run of importance source...
86. run of importance source...
87. run of importance source...
88. run of importance source...
89. run of importance source...
90. run of importance source...
91. run of importance source...
92. run of importance source...
After 92 iterations, +18 secs:
confirmed 1 attribute: Familiarity.with.Ethnic.Origin;
still have 1 attribute left.
93. run of importance source...
94. run of importance source...
95. run of importance source...
96. run of importance source...
97. run of importance source...
98. run of importance source...
99. run of importance source...
plot(pc_boruta,las=2,cex.axis=0.5)
pc_borutaBoruta performed 99 iterations in 19.78841 secs.
14 attributes confirmed important: Age, Dental.Insurance,
Duration.of.Residency, Employment, EnglishDiff and 9 more;
3 attributes confirmed unimportant: Belonging, Discrimination,
Identify.Ethnically;
1 tentative attributes left: US.Born;
pc_boruta$finalDecision Ethnicity Age
Confirmed Confirmed
Gender Religion
Confirmed Confirmed
Employment Income_median
Confirmed Confirmed
US.Born Duration.of.Residency
Tentative Confirmed
Primary.Language EnglishSpeak
Confirmed Confirmed
EnglishDiff Familiarity.with.America
Confirmed Confirmed
Familiarity.with.Ethnic.Origin Identify.Ethnically
Confirmed Rejected
Belonging Discrimination
Rejected Rejected
Health.Insurance Dental.Insurance
Confirmed Confirmed
Levels: Tentative Confirmed Rejected
Naive Bayes Classifier
fitControl <- trainControl(## 10-fold CV
method = "repeatedcv",
number = 10,
## repeated ten times
repeats = 10)
cl <- makePSOCKcluster(10)
registerDoParallel(cl)
set.seed(825)
rf_fit <- train(Physical.Check.up ~ ., data = rfdata,
method = "naive_bayes",
trControl = fitControl,
## This last option is actually one
## for gbm() that passes through
verbose = FALSE)
stopCluster(cl)
varImp(rf_fit)ROC curve variable importance
Importance
Duration.of.Residency 100.000
Dental.Insurance 75.670
Age 59.640
Health.Insurance 58.546
Income_median 48.264
EnglishDiff 36.790
Familiarity.with.America 32.617
Gender 31.000
Belonging 27.681
Primary.Language 25.326
Identify.Ethnically 25.209
EnglishSpeak 25.023
Employment 23.596
Familiarity.with.Ethnic.Origin 21.398
Discrimination 17.250
Ethnicity 7.396
Religion 5.464
US.Born 0.000
Dental Check-up
rfdata <- qol |> select(`Dentist Check-up`,Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) |>
na.omit() |>
as.data.frame() |>
rename_with(make.names)
imbal <- ROSE::ROSE(Dentist.Check.up~.,
data=rfdata,
seed=3)$data
# VSURF(Dentist.Check.up~.,imbal,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
VSURF(Dentist.Check.up~.,rfdata,na.action="na.omit",parallel=T,verbose=F)->vsurf.modWarning in VSURF.formula(Dentist.Check.up ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
vsurf.mod |> summary()
VSURF computation time: 10.6 secs
VSURF selected:
18 variables at thresholding step (in 5.8 secs)
3 variables at interpretation step (in 3.5 secs)
3 variables at prediction step (in 1.4 secs)
VSURF ran in parallel on a PSOCK cluster and used 15 cores
names(rfdata[,-1])[vsurf.mod$varselect.pred][1] "Dental.Insurance" "Duration.of.Residency" "Religion"
names(rfdata[,-1])[vsurf.mod$varselect.interp][1] "Dental.Insurance" "Duration.of.Residency" "Religion"
plot(vsurf.mod)
vsurf.mod$mean.perf[1] 0.2955125
Importance
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
Importance = vsurf.mod$imp.mean.dec,
sd_Importance = vsurf.mod$imp.sd.dec
)|>
mutate(fill = case_when(Variable=="Ethnicity"~"red",
.default="black"))
vi |> mutate(across(Importance:sd_Importance,~round(.x,5))) Variable Importance sd_Importance fill
1 Dental.Insurance 0.05183 0.00065 black
2 Duration.of.Residency 0.04928 0.00068 black
3 Religion 0.01610 0.00076 black
4 Ethnicity 0.01342 0.00088 red
5 Age 0.01150 0.00060 black
6 Health.Insurance 0.00817 0.00042 black
7 English.Speaking 0.00766 0.00045 black
8 Income_median 0.00676 0.00050 black
9 Identify.Ethnically 0.00378 0.00032 black
10 English.Difficulties 0.00325 0.00042 black
11 Familiarity.with.America 0.00286 0.00042 black
12 Familiarity.with.Ethnic.Origin 0.00282 0.00047 black
13 Belonging 0.00243 0.00046 black
14 Full.Time.Employment 0.00215 0.00035 black
15 Gender 0.00199 0.00035 black
16 US.Born 0.00178 0.00019 black
17 Primary.Language 0.00178 0.00028 black
18 Discrimination 0.00079 0.00025 black
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
geom_bar(stat = "identity",alpha=0.4) +
geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)
ggsave(filename = "VSURF_importance_Dc.png", width=12, height=8,units="in")Logistic regression (Interpretation)
lr <- rfdata |> select(Dentist.Check.up,names(rfdata[,-1])[vsurf.mod$varselect.interp])
lr_mod <- glm(Dentist.Check.up~.,family=binomial,data=lr)
summary(lr_mod)
Call:
glm(formula = Dentist.Check.up ~ ., family = binomial, data = lr)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.120303 0.142888 -7.840 4.49e-15 ***
Dental.InsuranceYes 1.631483 0.107046 15.241 < 2e-16 ***
Duration.of.Residency 0.045181 0.004657 9.701 < 2e-16 ***
ReligionBuddhist 0.139163 0.189583 0.734 0.462920
ReligionCatholic 0.012325 0.170281 0.072 0.942301
ReligionHindu -0.654930 0.168546 -3.886 0.000102 ***
ReligionMuslim -0.538116 0.338322 -1.591 0.111712
ReligionOther -0.678784 0.408196 -1.663 0.096335 .
ReligionProtestant -0.122292 0.156091 -0.783 0.433356
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2659.6 on 1960 degrees of freedom
Residual deviance: 2209.7 on 1952 degrees of freedom
AIC: 2227.7
Number of Fisher Scoring iterations: 4
car::Anova(lr_mod)Analysis of Deviance Table (Type II tests)
Response: Dentist.Check.up
LR Chisq Df Pr(>Chisq)
Dental.Insurance 251.011 1 < 2.2e-16 ***
Duration.of.Residency 102.856 1 < 2.2e-16 ***
Religion 27.422 6 0.0001207 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
broom::tidy(lr_mod,exponentiate=T,conf.int=T)|>
mutate(across(estimate:conf.high,~round(.x,3))) |>
DT::datatable()CV LASSO
x <- makeX(rfdata[,-1])
y <- rfdata$Dentist.Check.up
cv.glmnet(x,y,family="binomial", type.measure = "auc")-> modcv
modcv$lambda.1se[1] 0.1271423
coef(modcv,s="lambda.1se")55 x 1 sparse Matrix of class "dgCMatrix"
s1
(Intercept) 4.898908e-01
EthnicityChinese .
EthnicityAsian Indian .
EthnicityFilipino .
EthnicityKorean .
EthnicityOther .
EthnicityVietnamese .
Age .
GenderFemale .
GenderMale .
ReligionNone .
ReligionBuddhist .
ReligionCatholic .
ReligionHindu .
ReligionMuslim .
ReligionOther .
ReligionProtestant .
Full.Time.Employment0 .
Full.Time.EmploymentEmployed full time .
Income_medianBelow .
Income_medianAbove .
US.BornNo .
US.BornYes .
Duration.of.Residency 3.132535e-03
Primary.Language0 .
Primary.Language1 .
English.SpeakingNot at all .
English.SpeakingNot well .
English.SpeakingVery well .
English.SpeakingWell .
English.DifficultiesNot at all .
English.DifficultiesMuch .
English.DifficultiesNot much .
English.DifficultiesVery much .
Familiarity.with.AmericaVery low .
Familiarity.with.AmericaLow .
Familiarity.with.AmericaHigh .
Familiarity.with.AmericaVery high .
Familiarity.with.Ethnic.OriginVery low .
Familiarity.with.Ethnic.OriginLow .
Familiarity.with.Ethnic.OriginHigh .
Familiarity.with.Ethnic.OriginVery high .
Identify.EthnicallyNot at all .
Identify.EthnicallyNot very close .
Identify.EthnicallySomewhat close .
Identify.EthnicallyVery close .
BelongingNot at all .
BelongingNot very much .
BelongingSomewhat .
BelongingVery much .
Discrimination .
Health.Insurance0 .
Health.InsuranceYes .
Dental.Insurance0 -4.719636e-01
Dental.InsuranceYes 1.567321e-13
print(modcv)
Call: cv.glmnet(x = x, y = y, type.measure = "auc", family = "binomial")
Measure: AUC
Lambda Index Measure SE Nonzero
min 0.01363 29 0.7788 0.01055 20
1se 0.12714 5 0.7682 0.01208 3
Boruta
set.seed(123)
Boruta(Dentist.Check.up~.,
data=rfdata,
doTrace=2,
) ->pc_boruta 1. run of importance source...
2. run of importance source...
3. run of importance source...
4. run of importance source...
5. run of importance source...
6. run of importance source...
7. run of importance source...
8. run of importance source...
9. run of importance source...
10. run of importance source...
11. run of importance source...
After 11 iterations, +2.4 secs:
confirmed 11 attributes: Age, Dental.Insurance, Duration.of.Residency, English.Speaking, Ethnicity and 6 more;
still have 7 attributes left.
12. run of importance source...
13. run of importance source...
14. run of importance source...
15. run of importance source...
After 15 iterations, +3.3 secs:
confirmed 2 attributes: English.Difficulties, Primary.Language;
still have 5 attributes left.
16. run of importance source...
17. run of importance source...
18. run of importance source...
19. run of importance source...
20. run of importance source...
21. run of importance source...
22. run of importance source...
23. run of importance source...
24. run of importance source...
25. run of importance source...
26. run of importance source...
27. run of importance source...
28. run of importance source...
29. run of importance source...
30. run of importance source...
31. run of importance source...
32. run of importance source...
33. run of importance source...
34. run of importance source...
35. run of importance source...
36. run of importance source...
37. run of importance source...
38. run of importance source...
39. run of importance source...
After 39 iterations, +8.6 secs:
rejected 1 attribute: Discrimination;
still have 4 attributes left.
40. run of importance source...
41. run of importance source...
42. run of importance source...
43. run of importance source...
44. run of importance source...
45. run of importance source...
46. run of importance source...
47. run of importance source...
48. run of importance source...
49. run of importance source...
50. run of importance source...
51. run of importance source...
52. run of importance source...
53. run of importance source...
After 53 iterations, +11 secs:
rejected 1 attribute: Belonging;
still have 3 attributes left.
54. run of importance source...
55. run of importance source...
56. run of importance source...
57. run of importance source...
58. run of importance source...
59. run of importance source...
60. run of importance source...
61. run of importance source...
62. run of importance source...
63. run of importance source...
64. run of importance source...
65. run of importance source...
66. run of importance source...
67. run of importance source...
68. run of importance source...
69. run of importance source...
70. run of importance source...
71. run of importance source...
72. run of importance source...
73. run of importance source...
74. run of importance source...
75. run of importance source...
After 75 iterations, +16 secs:
confirmed 1 attribute: Full.Time.Employment;
rejected 1 attribute: Familiarity.with.Ethnic.Origin;
still have 1 attribute left.
76. run of importance source...
77. run of importance source...
78. run of importance source...
79. run of importance source...
80. run of importance source...
81. run of importance source...
82. run of importance source...
83. run of importance source...
84. run of importance source...
85. run of importance source...
86. run of importance source...
87. run of importance source...
88. run of importance source...
89. run of importance source...
90. run of importance source...
91. run of importance source...
92. run of importance source...
93. run of importance source...
94. run of importance source...
95. run of importance source...
96. run of importance source...
97. run of importance source...
98. run of importance source...
99. run of importance source...
plot(pc_boruta,las=2,cex.axis=0.5)
pc_borutaBoruta performed 99 iterations in 20.35846 secs.
14 attributes confirmed important: Age, Dental.Insurance,
Duration.of.Residency, English.Difficulties, English.Speaking and 9
more;
3 attributes confirmed unimportant: Belonging, Discrimination,
Familiarity.with.Ethnic.Origin;
1 tentative attributes left: Identify.Ethnically;
pc_boruta$finalDecision Ethnicity Age
Confirmed Confirmed
Gender Religion
Confirmed Confirmed
Full.Time.Employment Income_median
Confirmed Confirmed
US.Born Duration.of.Residency
Confirmed Confirmed
Primary.Language English.Speaking
Confirmed Confirmed
English.Difficulties Familiarity.with.America
Confirmed Confirmed
Familiarity.with.Ethnic.Origin Identify.Ethnically
Rejected Tentative
Belonging Discrimination
Rejected Rejected
Health.Insurance Dental.Insurance
Confirmed Confirmed
Levels: Tentative Confirmed Rejected
Naive Bayes Classifier
fitControl <- trainControl(## 10-fold CV
method = "repeatedcv",
number = 10,
## repeated ten times
repeats = 10)
cl <- makePSOCKcluster(10)
registerDoParallel(cl)
set.seed(825)
rf_fit <- train(Dentist.Check.up ~ ., data = rfdata,
method = "naive_bayes",
trControl = fitControl,
## This last option is actually one
## for gbm() that passes through
verbose = FALSE)
stopCluster(cl)
varImp(rf_fit)ROC curve variable importance
Importance
Duration.of.Residency 100.000
Dental.Insurance 96.639
Income_median 53.830
Familiarity.with.America 49.128
Health.Insurance 38.691
English.Difficulties 37.429
Primary.Language 30.509
Age 30.053
Gender 26.747
Full.Time.Employment 21.299
English.Speaking 19.358
Discrimination 14.236
Identify.Ethnically 14.044
Religion 13.809
Belonging 12.067
US.Born 11.577
Familiarity.with.Ethnic.Origin 4.759
Ethnicity 0.000
Folkmedicine
rfdata <- qol |> select(`Folkmedicine`,Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) |>
na.omit() |>
as.data.frame() |>
rename_with(make.names)
imbal <- ROSE::ROSE(Folkmedicine~.,
data=rfdata,
seed=3)$data
# VSURF(Folkmedicine~.,imbal,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
VSURF(Folkmedicine~.,rfdata,na.action="na.omit",parallel=T,verbose=F)->vsurf.modWarning in VSURF.formula(Folkmedicine ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
vsurf.mod |> summary()
VSURF computation time: 9.8 secs
VSURF selected:
18 variables at thresholding step (in 4.8 secs)
3 variables at interpretation step (in 3.2 secs)
2 variables at prediction step (in 1.8 secs)
VSURF ran in parallel on a PSOCK cluster and used 15 cores
names(rfdata[,-1])[vsurf.mod$varselect.pred][1] "Age" "Ethnicity"
names(rfdata[,-1])[vsurf.mod$varselect.interp][1] "Age" "Duration.of.Residency" "Ethnicity"
plot(vsurf.mod)
vsurf.mod$mean.perf[1] 0.1380329
Importance
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
Importance = vsurf.mod$imp.mean.dec,
sd_Importance = vsurf.mod$imp.sd.dec
)|>
mutate(fill = case_when(Variable=="Ethnicity"~"red",
.default="black"))
vi |> mutate(across(Importance:sd_Importance,~round(.x,5))) Variable Importance sd_Importance fill
1 Age 0.01728 0.00085 black
2 Duration.of.Residency 0.01354 0.00073 black
3 Ethnicity 0.01305 0.00050 red
4 English.Speaking 0.00905 0.00062 black
5 Religion 0.00703 0.00051 black
6 Income_median 0.00559 0.00041 black
7 English.Difficulties 0.00472 0.00053 black
8 Primary.Language 0.00262 0.00043 black
9 Familiarity.with.Ethnic.Origin 0.00259 0.00022 black
10 Familiarity.with.America 0.00257 0.00040 black
11 Full.Time.Employment 0.00246 0.00032 black
12 Discrimination 0.00126 0.00040 black
13 Dental.Insurance 0.00122 0.00024 black
14 Health.Insurance 0.00079 0.00015 black
15 US.Born 0.00073 0.00023 black
16 Identify.Ethnically 0.00073 0.00034 black
17 Gender 0.00059 0.00020 black
18 Belonging 0.00040 0.00033 black
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
geom_bar(stat = "identity",alpha=0.4) +
geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)
ggsave(filename = "VSURF_importance_Alt.png", width=12, height=8,units="in")Logistic regression (Interpretation)
lr <- rfdata |> select(Folkmedicine,names(rfdata[,-1])[vsurf.mod$varselect.interp])
lr_mod <- glm(Folkmedicine~.,family=binomial,data=lr)
summary(lr_mod)
Call:
glm(formula = Folkmedicine ~ ., family = binomial, data = lr)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.612602 0.219844 -11.884 < 2e-16 ***
Age 0.022611 0.004636 4.877 1.08e-06 ***
Duration.of.Residency 0.007417 0.005960 1.244 0.213371
EthnicityAsian Indian -0.809613 0.218076 -3.713 0.000205 ***
EthnicityFilipino -0.742468 0.272122 -2.728 0.006364 **
EthnicityKorean 0.234543 0.173006 1.356 0.175197
EthnicityOther -0.735426 0.357019 -2.060 0.039408 *
EthnicityVietnamese -1.084595 0.232545 -4.664 3.10e-06 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1552.9 on 1946 degrees of freedom
Residual deviance: 1450.9 on 1939 degrees of freedom
AIC: 1466.9
Number of Fisher Scoring iterations: 5
car::Anova(lr_mod)Analysis of Deviance Table (Type II tests)
Response: Folkmedicine
LR Chisq Df Pr(>Chisq)
Age 23.485 1 1.259e-06 ***
Duration.of.Residency 1.544 1 0.2141
Ethnicity 56.750 5 5.693e-11 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
broom::tidy(lr_mod,exponentiate=T,conf.int=T)|>
mutate(across(estimate:conf.high,~round(.x,3))) |>
DT::datatable()CV LASSO
x <- makeX(rfdata[,-1])
y <- rfdata$Folkmedicine
cv.glmnet(x,y,family="binomial", type.measure = "auc")-> modcv
modcv$lambda.1se[1] 0.02076259
coef(modcv,s="lambda.1se")55 x 1 sparse Matrix of class "dgCMatrix"
s1
(Intercept) -2.480267e+00
EthnicityChinese 1.445335e-01
EthnicityAsian Indian .
EthnicityFilipino .
EthnicityKorean 3.345835e-01
EthnicityOther .
EthnicityVietnamese -8.517757e-02
Age 1.238592e-02
GenderFemale .
GenderMale .
ReligionNone .
ReligionBuddhist .
ReligionCatholic .
ReligionHindu .
ReligionMuslim .
ReligionOther .
ReligionProtestant 1.573109e-01
Full.Time.Employment0 2.915342e-02
Full.Time.EmploymentEmployed full time -7.843686e-16
Income_medianBelow .
Income_medianAbove .
US.BornNo .
US.BornYes .
Duration.of.Residency 1.609651e-03
Primary.Language0 .
Primary.Language1 .
English.SpeakingNot at all .
English.SpeakingNot well .
English.SpeakingVery well -2.241479e-01
English.SpeakingWell .
English.DifficultiesNot at all .
English.DifficultiesMuch .
English.DifficultiesNot much .
English.DifficultiesVery much .
Familiarity.with.AmericaVery low .
Familiarity.with.AmericaLow .
Familiarity.with.AmericaHigh .
Familiarity.with.AmericaVery high .
Familiarity.with.Ethnic.OriginVery low .
Familiarity.with.Ethnic.OriginLow .
Familiarity.with.Ethnic.OriginHigh .
Familiarity.with.Ethnic.OriginVery high .
Identify.EthnicallyNot at all .
Identify.EthnicallyNot very close .
Identify.EthnicallySomewhat close .
Identify.EthnicallyVery close .
BelongingNot at all .
BelongingNot very much .
BelongingSomewhat .
BelongingVery much .
Discrimination .
Health.Insurance0 .
Health.InsuranceYes .
Dental.Insurance0 .
Dental.InsuranceYes .
print(modcv)
Call: cv.glmnet(x = x, y = y, type.measure = "auc", family = "binomial")
Measure: AUC
Lambda Index Measure SE Nonzero
min 0.002943 32 0.6842 0.01542 33
1se 0.020763 11 0.6694 0.02331 9
Boruta
set.seed(123)
Boruta(Folkmedicine~.,
data=rfdata,
doTrace=2,
) ->pc_boruta 1. run of importance source...
2. run of importance source...
3. run of importance source...
4. run of importance source...
5. run of importance source...
6. run of importance source...
7. run of importance source...
8. run of importance source...
9. run of importance source...
10. run of importance source...
11. run of importance source...
After 11 iterations, +2.1 secs:
confirmed 7 attributes: Age, Duration.of.Residency, English.Speaking, Ethnicity, Familiarity.with.America and 2 more;
still have 11 attributes left.
12. run of importance source...
13. run of importance source...
14. run of importance source...
15. run of importance source...
After 15 iterations, +2.8 secs:
confirmed 2 attributes: Income_median, Religion;
still have 9 attributes left.
16. run of importance source...
17. run of importance source...
18. run of importance source...
19. run of importance source...
After 19 iterations, +3.6 secs:
confirmed 1 attribute: English.Difficulties;
rejected 1 attribute: Belonging;
still have 7 attributes left.
20. run of importance source...
21. run of importance source...
22. run of importance source...
After 22 iterations, +4.1 secs:
rejected 1 attribute: Gender;
still have 6 attributes left.
23. run of importance source...
24. run of importance source...
25. run of importance source...
After 25 iterations, +4.6 secs:
confirmed 1 attribute: Familiarity.with.Ethnic.Origin;
still have 5 attributes left.
26. run of importance source...
27. run of importance source...
28. run of importance source...
After 28 iterations, +5.1 secs:
confirmed 2 attributes: Dental.Insurance, Discrimination;
still have 3 attributes left.
29. run of importance source...
30. run of importance source...
31. run of importance source...
32. run of importance source...
33. run of importance source...
34. run of importance source...
35. run of importance source...
36. run of importance source...
37. run of importance source...
After 37 iterations, +6.6 secs:
confirmed 1 attribute: Health.Insurance;
still have 2 attributes left.
38. run of importance source...
39. run of importance source...
40. run of importance source...
41. run of importance source...
42. run of importance source...
43. run of importance source...
44. run of importance source...
45. run of importance source...
46. run of importance source...
47. run of importance source...
48. run of importance source...
49. run of importance source...
50. run of importance source...
51. run of importance source...
52. run of importance source...
53. run of importance source...
54. run of importance source...
55. run of importance source...
56. run of importance source...
57. run of importance source...
58. run of importance source...
59. run of importance source...
60. run of importance source...
61. run of importance source...
62. run of importance source...
63. run of importance source...
64. run of importance source...
65. run of importance source...
66. run of importance source...
67. run of importance source...
68. run of importance source...
69. run of importance source...
70. run of importance source...
71. run of importance source...
72. run of importance source...
73. run of importance source...
74. run of importance source...
75. run of importance source...
76. run of importance source...
77. run of importance source...
78. run of importance source...
79. run of importance source...
80. run of importance source...
81. run of importance source...
82. run of importance source...
83. run of importance source...
After 83 iterations, +14 secs:
rejected 1 attribute: Identify.Ethnically;
still have 1 attribute left.
84. run of importance source...
85. run of importance source...
86. run of importance source...
87. run of importance source...
88. run of importance source...
89. run of importance source...
90. run of importance source...
91. run of importance source...
92. run of importance source...
93. run of importance source...
94. run of importance source...
95. run of importance source...
96. run of importance source...
97. run of importance source...
98. run of importance source...
99. run of importance source...
plot(pc_boruta,las=2,cex.axis=0.5)
pc_borutaBoruta performed 99 iterations in 16.96238 secs.
14 attributes confirmed important: Age, Dental.Insurance,
Discrimination, Duration.of.Residency, English.Difficulties and 9 more;
3 attributes confirmed unimportant: Belonging, Gender,
Identify.Ethnically;
1 tentative attributes left: US.Born;
pc_boruta$finalDecision Ethnicity Age
Confirmed Confirmed
Gender Religion
Rejected Confirmed
Full.Time.Employment Income_median
Confirmed Confirmed
US.Born Duration.of.Residency
Tentative Confirmed
Primary.Language English.Speaking
Confirmed Confirmed
English.Difficulties Familiarity.with.America
Confirmed Confirmed
Familiarity.with.Ethnic.Origin Identify.Ethnically
Confirmed Rejected
Belonging Discrimination
Rejected Confirmed
Health.Insurance Dental.Insurance
Confirmed Confirmed
Levels: Tentative Confirmed Rejected
Naive Bayes Classifier
fitControl <- trainControl(## 10-fold CV
method = "repeatedcv",
number = 10,
## repeated ten times
repeats = 10)
cl <- makePSOCKcluster(10)
registerDoParallel(cl)
set.seed(825)
rf_fit <- train(Folkmedicine ~ ., data = rfdata,
method = "naive_bayes",
trControl = fitControl,
## This last option is actually one
## for gbm() that passes through
verbose = FALSE)Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
extra argument 'verbose' will be disregarded
stopCluster(cl)
varImp(rf_fit)ROC curve variable importance
Importance
Age 100.000
Duration.of.Residency 61.198
Full.Time.Employment 49.606
Belonging 40.543
Familiarity.with.America 39.330
Religion 39.037
Primary.Language 35.805
Gender 30.127
Ethnicity 29.581
Discrimination 24.699
Identify.Ethnically 22.885
English.Speaking 14.384
US.Born 10.342
Familiarity.with.Ethnic.Origin 4.550
Dental.Insurance 2.944
Health.Insurance 2.618
English.Difficulties 0.779
Income_median 0.000